# Purpose of the code:
# Compare the clinical data to device measurements
# necessary imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math as math
from scipy import stats
# NOTE(review): the lines below are IPython magics / shell shortcuts — this file
# is an exported notebook and these lines will not run under plain `python`.
%matplotlib inline
# switch to a proper directory to access the data
pwd
cd /camdatalake/bronze/verily_ms/device/
pwd
# download and read the data
# FeatureDay: Average value of the features for each day of study. Days are listed as
# DayOfStudy
# FeatureStudy: Features for the entire study period. For the at home features,
# the reported value is the median of the observed day level values.
import gzip, csv
# the .gz archives are opened in text mode so pandas receives plain CSV text
# (pd.read_csv could also read the .gz path directly with compression='gzip')
with gzip.open("FeaturesDay.csv.gz", "rt", newline="") as file:
    FeatureDay = pd.read_csv(file, header = 0)
with gzip.open("FeaturesStudy.csv.gz", "rt", newline="") as file:
    FeatureStudy = pd.read_csv(file, header = 0)
# exploring the dataset — these bare expressions only display output in a notebook
FeatureDay.info()
FeatureDay.describe()
FeatureDay.head()
# extracting list of feature names in the dataset
list(FeatureDay.columns)
# list of unique patient IDs; the order of this list fixes the row order of
# every per-patient structure built below
patient_IDs = list(FeatureDay['gls_subject_code'].unique())
patient_IDs
# the 10 free-living features that showed high correlation to MSFC
free_living_features_highly_correlated = [
    'idle_minutes',
    'turn_vel_std_ankle',
    'swing',
    'stance',
    'duration_movement_count',
    'turn_vel_max_ankle',
    'turn_duration_ankle',
    'duration_rem_count',
    'rem_percent',
    'movement_rate',
]
# the 19 at-home (structured activity) features that showed high correlation to MSFC
at_home_features_highly_correlated = [
    'mean_pvt_delay_7_at_home',
    'mobility_stance_at_home',
    'mean_pvt_delay_at_home',
    'pq_nondominant_rhythm_at_home',
    'pq_nondominant_median_at_home',
    'pq_dominant_rhythm_at_home',
    'turn_vel_max_at_home',
    'mobility_swing_at_home',
    'zx_dominant_num_correct_at_home',
    'turn_vel_std_at_home',
    'turn_duration_ankle_at_home',
    'turn_vel_max_ankle_at_home',
    'mean_pvt_delay_5_at_home',
    'zx_nondominant_median_at_home',
    'zx_nondominant_num_correct_at_home',
    'mean_pvt_delay_3_at_home',
    'turn_vel_std_ankle_at_home',
    'mobility_activity_at_home_time',
    'mean_pvt_delay_1_at_home',
]
# check the content of the directory
ls
# extract part of the FeatureDay related to clinical visit_3
# NOTE(review): 'FeatureStudy_clinical_3_related' (no file extension) is read
# from disk here, yet a dataframe of the same name is rebuilt from FeatureStudy
# further below — presumably these files were saved by an earlier session;
# confirm they exist and are up to date before rerunning.
FeatureStudy_clinical_3_related = pd.read_csv('FeatureStudy_clinical_3_related')
clinical_3_related_col_names = list(FeatureStudy_clinical_3_related.columns)
# assumes every column name in the saved file also exists in FeatureDay — TODO confirm
FeatureDay_clinical_3_related = FeatureDay[['gls_subject_code','dayofstudy'] + clinical_3_related_col_names]
# total count of missing values (displayed only in a notebook)
FeatureDay_clinical_3_related.isnull().sum().sum()
# extract part of the FeatureDay related to clinical visit_2
FeatureStudy_clinical_2_related = pd.read_csv('FeatureStudy_clinical_2_related')
clinical_2_related_col_names = list(FeatureStudy_clinical_2_related.columns)
FeatureDay_clinical_2_related = FeatureDay[['gls_subject_code','dayofstudy'] + clinical_2_related_col_names]
FeatureDay_clinical_2_related.isnull().sum().sum()
# extract part of the FeatureDay related to clinical visit_1
FeatureStudy_clinical_1_related = pd.read_csv('FeatureStudy_clinical_1_related')
clinical_1_related_col_names = list(FeatureStudy_clinical_1_related.columns)
FeatureDay_clinical_1_related = FeatureDay[['gls_subject_code','dayofstudy'] + clinical_1_related_col_names]
FeatureDay_clinical_1_related.isnull().sum().sum()
# extract part of the FeatureDay related to all clinical visits
FeatureStudy_clinical_related = pd.read_csv('FeatureStudy_clinical_related')
clinical_related_col_names = list(FeatureStudy_clinical_related.columns)
FeatureDay_clinical_related = FeatureDay[['gls_subject_code','dayofstudy'] + clinical_related_col_names]
FeatureDay_clinical_related.isnull().sum().sum()
# extract the FeatureStudy columns related to each clinical visit
FeatureStudy_columns_names = list(FeatureStudy.columns)
_clinic_tags = ('clinic_1', 'clinic_2', 'clinic_3')
FeatureStudy_clinical_1_col_names = [n for n in FeatureStudy_columns_names if 'clinic_1' in n.lower()]
FeatureStudy_clinical_2_col_names = [n for n in FeatureStudy_columns_names if 'clinic_2' in n.lower()]
FeatureStudy_clinical_3_col_names = [n for n in FeatureStudy_columns_names if 'clinic_3' in n.lower()]
# combined list in column order; a column matching several tags would appear once per tag
FeatureStudy_clinical_col_names = [n for n in FeatureStudy_columns_names
                                   for tag in _clinic_tags if tag in n.lower()]
# per-visit slices of FeatureStudy, keyed by subject code
FeatureStudy_clinical_1_related = FeatureStudy[['gls_subject_code'] + FeatureStudy_clinical_1_col_names]
FeatureStudy_clinical_2_related = FeatureStudy[['gls_subject_code'] + FeatureStudy_clinical_2_col_names]
FeatureStudy_clinical_3_related = FeatureStudy[['gls_subject_code'] + FeatureStudy_clinical_3_col_names]
# all clinical data side by side (note: gls_subject_code appears three times)
FeatureStudy_clinical_related = pd.concat([FeatureStudy_clinical_1_related,
                                           FeatureStudy_clinical_2_related,
                                           FeatureStudy_clinical_3_related], axis=1)
print(FeatureStudy_clinical_1_related.shape)
print(FeatureStudy_clinical_2_related.shape)
print(FeatureStudy_clinical_3_related.shape)
print(FeatureStudy_clinical_related.shape)
# bare .head() calls only display output inside a notebook
FeatureStudy_clinical_1_related.head()
FeatureStudy_clinical_2_related.head()
FeatureStudy_clinical_3_related.head()
# at-home features (structured activity) have clinical measurements available in
# the dataset; build one dataframe holding the clinic-2 and clinic-3 values of
# every highly correlated at-home feature, one row per patient
def _clinic_values_for_patients(source_df, column):
    # collect `column` from source_df in patient_IDs order
    # (rows per patient are concatenated, matching the original accumulation)
    values = []
    for pid in patient_IDs:
        values.extend(source_df[source_df['gls_subject_code'] == pid][column].tolist())
    return values

at_home_features_highly_correlated_clinical_2_3 = pd.DataFrame({'patient_ID': patient_IDs})
for feature in at_home_features_highly_correlated:
    # clinic-2 column first, then clinic-3, preserving the original column order
    for visit_df, tag in ((FeatureStudy_clinical_2_related, 'clinic_2'),
                          (FeatureStudy_clinical_3_related, 'clinic_3')):
        visit_col = feature.replace('at_home', tag)
        at_home_features_highly_correlated_clinical_2_3[visit_col] = \
            _clinic_values_for_patients(visit_df, visit_col)
at_home_features_highly_correlated_clinical_2_3.shape
at_home_features_highly_correlated_clinical_2_3.head()
def remove_outliers(feature_values, day_of_study, m=2):
    """Drop points more than ``m`` standard deviations from the mean.

    Parameters
    ----------
    feature_values : pd.Series
        Feature measurements. NaN never passes the band test, so NaN points
        are dropped as well.
    day_of_study : pd.Series
        Day-of-study values aligned element-wise with ``feature_values``.
    m : int or float, optional
        Half-width of the acceptance band in standard deviations.
        Default 2 matches the previously hard-coded threshold.

    Returns
    -------
    (pd.Series, pd.Series)
        Filtered feature values and their matching days. If every point is
        filtered out, returns an empty Series and the ORIGINAL
        ``day_of_study`` (kept for backward compatibility with callers that
        only test ``len`` of the first element).
    """
    mean = feature_values.mean()
    std = feature_values.std()
    lower, upper = mean - m * std, mean + m * std
    # keep (value, day) pairs whose value lies inside the band
    kept = [(v, d) for v, d in zip(feature_values, day_of_study)
            if lower <= v <= upper]
    if kept:
        vals, days = zip(*kept)
        return pd.Series(list(vals)), pd.Series(list(days))
    # nothing survived the filter (empty or all-NaN input)
    return pd.Series([], dtype=float), day_of_study
def standardize_axis(feature):
    """Compute shared axis maxima for plotting one feature across all patients.

    For each patient, extracts the feature's day-level values from the
    module-level ``FeatureDay`` dataframe, removes outliers via
    ``remove_outliers``, and pools the surviving points.

    Assumes ``FeatureDay`` and ``patient_IDs`` are already defined at module
    level.

    Returns
    -------
    (max_y, max_x)
        The largest filtered feature value and the largest associated day of
        study — used as common y/x axis limits.
    """
    all_filtered_feature_values = []
    all_filtered_days_of_studies = []
    col_1 = feature
    col_2 = 'dayofstudy'
    # loop on all the patients
    for ID in patient_IDs:
        # per-patient slice of FeatureDay for this feature
        df = FeatureDay[FeatureDay['gls_subject_code'] == ID][[col_1, col_2]]
        # sort the dataframe based on days of study
        # BUGFIX: DataFrame.sort() was removed in pandas 0.20 — use sort_values()
        # (reassigning also avoids an inplace write to a sliced frame)
        df = df.sort_values(col_2)
        x = df[col_2]
        y = df[col_1]
        # remove outliers (both feature values & associated day of study)
        y, x = remove_outliers(y, x)
        all_filtered_feature_values += y.tolist()
        all_filtered_days_of_studies += x.tolist()
    # axis ranges are the maxima over all patients' filtered values
    max_y = np.max(all_filtered_feature_values)
    max_x = np.max(all_filtered_days_of_studies)
    return max_y, max_x
def plot_feature_across_days_and_clinical_visitis(feature):
    """Plot one at-home feature vs. day of study for every patient (5x5 grid),
    overlaying a straight line through the clinic-2 and clinic-3 measurements.

    Assumes module-level ``FeatureDay``, ``patient_IDs`` and
    ``at_home_features_highly_correlated_clinical_2_3`` are already defined.
    """
    figs, axes = plt.subplots(nrows=5, ncols=5, figsize=(20, 20), dpi=200)
    print(feature)
    col_1 = feature
    col_2 = 'dayofstudy'
    # BUGFIX: the axis limits depend only on `feature`, so compute them once
    # here instead of re-scanning every patient inside the loop (previously
    # called per subplot, making the whole plot O(n_patients^2))
    max_y, max_x = standardize_axis(feature)
    # plot the measurements vs. days of study
    for idx, ID in enumerate(patient_IDs):
        df = FeatureDay[FeatureDay['gls_subject_code'] == ID][[col_1, col_2]]
        # BUGFIX: DataFrame.sort() was removed in pandas 0.20 — use sort_values()
        df = df.sort_values(col_2)
        x = df[col_2]
        y = df[col_1]
        # grid position, assuming 25 patients in a 5x5 layout
        row = idx // 5
        col = idx % 5
        ax = axes[row, col]
        ax.set_xlim(0, max_x)
        # NOTE(review): MSFC scores can be negative, but the lower y-limit is
        # still 0 here (behavior kept from the original) — confirm intended.
        ax.set_ylim(0, max_y)
        ax.set_title(ID, y=0.9)
        ax.set_xlabel('Days of Study')
        ax.set_ylabel(feature)
        # skip patients whose only distinct "value" is NaN; otherwise plot the
        # outlier-filtered device measurements with a regression line
        if not ((len(y.unique()) == 1) & (np.isnan(y.unique()).sum() == 1)):
            y, x = remove_outliers(y, x)
            if len(y) > 0:
                # keyword args keep this compatible with seaborn >= 0.12,
                # which removed positional x/y
                sns.regplot(x=x, y=y, ax=ax, label='device')
                ax.set_xlabel('Days of Study')
                ax.set_ylabel(feature)
        # overlay a line through the clinic visit 2 & 3 measurements
        col_1_new = feature.replace('at_home', 'clinic_2')
        col_2_new = feature.replace('at_home', 'clinic_3')
        x_new = [0, max_x]
        patient_row = at_home_features_highly_correlated_clinical_2_3[
            at_home_features_highly_correlated_clinical_2_3['patient_ID'] == ID]
        y_new = list(patient_row[[col_1_new, col_2_new]].iloc[0])
        ax.plot(x_new, y_new, label='clinical visit')
        ax.legend(loc='best')
def clinical_visits_slope(col_1, col_2, max_x):
    """Slope of the line through each patient's two clinic measurements.

    col_1, col_2: clinic-visit column names in the module-level dataframe
    ``at_home_features_highly_correlated_clinical_2_3``.
    max_x: length of the study (x position of the second clinic value).

    Returns a list of slopes, one per entry of ``patient_IDs``.
    """
    clinic_df = at_home_features_highly_correlated_clinical_2_3
    x_points = [0, max_x]
    slopes = []
    # loop on patient IDs
    for pid in patient_IDs:
        # the patient's clinic-2 and clinic-3 values for this feature
        y_points = list(clinic_df[clinic_df['patient_ID'] == pid][[col_1, col_2]].iloc[0])
        fit = stats.linregress(x_points, y_points)
        slopes.append(fit[0])  # slope only
    return slopes
def regLine_slope(feature):
    """Slope of a regression line fit to a patient's day-level device values.

    For each patient in the module-level ``patient_IDs``, pulls the feature's
    daily values from ``FeatureDay``, removes outliers, and fits a
    least-squares line of value vs. day of study. Patients with no usable
    data get ``np.nan``.

    Returns a list of slopes aligned with ``patient_IDs``.
    """
    print(feature)
    slopes = []
    col_1 = feature
    col_2 = 'dayofstudy'
    # loop on patient IDs
    for ID in patient_IDs:
        df = FeatureDay[FeatureDay['gls_subject_code'] == ID][[col_1, col_2]]
        # sort the dataframe based on days of the study
        # BUGFIX: DataFrame.sort() was removed in pandas 0.20 — use sort_values()
        df = df.sort_values(col_2)
        # x: days of study, y: feature values
        x = df[col_2]
        y = df[col_1]
        # a patient whose only distinct "value" is NaN has no usable data
        if (len(y.unique()) == 1) & (np.isnan(y.unique()).sum() == 1):
            slopes.append(np.nan)
        else:
            y, x = remove_outliers(y, x)
            if len(y) == 0:
                # every point was filtered out — no line can be fit
                slopes.append(np.nan)
            else:
                slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
                slopes.append(slope)
    return slopes
# gather, per patient, the slope of the device-measurement trend and the slope
# of the clinic-visit line for every highly correlated at-home feature
summary_of_slopes_df = pd.DataFrame({'patient_IDs': patient_IDs})
# regression-line slopes for at-home features (structured activity)
for feat in at_home_features_highly_correlated:
    summary_of_slopes_df[feat] = regLine_slope(feat)
# slopes of the lines through the clinic-2 / clinic-3 measurements
for feat in at_home_features_highly_correlated:
    clinic_2_col = feat.replace('at_home', 'clinic_2')
    clinic_3_col = feat.replace('at_home', 'clinic_3')
    _, study_length = standardize_axis(feat)
    summary_of_slopes_df[feat.replace('at_home', 'clinic_2_3')] = \
        clinical_visits_slope(clinic_2_col, clinic_3_col, study_length)
summary_of_slopes_df.head()
# heatmap of correlation between slopes
plt.figure(figsize=(23, 23))
sns.heatmap(summary_of_slopes_df.corr(), annot=True)
# mask entries with low correlation so they disappear from the heatmap
def func(x):
    # True = masked (hidden): correlation magnitude strictly below 0.5
    return -0.5 < x < 0.50
df = summary_of_slopes_df.corr()
# make a mask dataframe: True entries are hidden in the heatmap
mask = df.isnull()
for name in list(df.columns):
    mask[name] = df[name].apply(func)
# mask the top half of the map as it is symmetrical
(row, col) = mask.shape
for i in range(row):
    for j in range(col):
        if j > i:
            # BUGFIX: `mask.iloc[i][j] = True` is chained indexing — it can
            # write to a temporary copy and silently leave `mask` unchanged;
            # `mask.iloc[i, j]` writes to the dataframe itself
            mask.iloc[i, j] = True
# plot the heatmap
plt.figure(figsize=(25, 25))
sns.heatmap(df, mask=mask, annot=True)
def corr_and_p_values(df):
    """Pairwise Pearson correlations and p-values for a dataframe's columns.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric dataframe. Rows with a null in either column of a pair are
        dropped pairwise, because pearsonr returns NaN for the whole result
        if any input value is NaN.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Correlation matrix and p-value matrix, both labeled with df's columns.
    """
    from scipy.stats import pearsonr
    n = df.shape[1]
    coeffmat = np.zeros((n, n))
    pvalmat = np.zeros((n, n))
    for i in range(n):
        # IMPROVEMENT: Pearson correlation is symmetric, so compute each pair
        # once and mirror it (previously every pair was computed twice)
        for j in range(i, n):
            column1 = df.columns[i]
            column2 = df.columns[j]
            # drop all null values pairwise before calling pearsonr
            df_clean = df[[column1, column2]].dropna()
            df_clean.columns = [column1 + '_1', column2 + '_2']
            r, p = pearsonr(df_clean[column1 + '_1'], df_clean[column2 + '_2'])
            coeffmat[i, j] = coeffmat[j, i] = r
            pvalmat[i, j] = pvalmat[j, i] = p
    # wrap the matrices with the original column labels
    dfcoeff = pd.DataFrame(coeffmat, columns=df.columns, index=df.columns)
    dfpvals = pd.DataFrame(pvalmat, columns=df.columns, index=df.columns)
    return dfcoeff, dfpvals
def extract_high_corr_low_p_value(dfcoeff, dfpvals, corr_th, p_th):
    """Pick device/clinic pairs whose slope correlation is strong and significant.

    dfcoeff, dfpvals: square correlation / p-value dataframes with matching labels.
    corr_th: minimum absolute correlation; p_th: maximum p-value.

    Returns a dataframe with columns feature_1, feature_2, corr_values, p_values.
    """
    collected = {'feature_1': [], 'feature_2': [], 'corr_values': [], 'p_values': []}
    for col in list(dfcoeff.columns):
        for row in list(dfcoeff.index):
            # skip the diagonal
            if row == col:
                continue
            corr_val = dfcoeff.at[row, col]
            p_val = dfpvals.at[row, col]
            # keep only strong AND significant correlations
            # (positive test so that NaN entries fail it, as before)
            strong = (np.abs(corr_val) >= corr_th) and (p_val <= p_th)
            if not strong:
                continue
            # keep only device-vs-clinic pairs of the SAME underlying feature
            same_feature = (row.split('_clinic_2_3')[0] == col.split('_at_home')[0]
                            or col.split('_clinic_2_3')[0] == row.split('_at_home')[0])
            if same_feature:
                collected['feature_1'].append(col)
                collected['feature_2'].append(row)
                collected['corr_values'].append(corr_val)
                collected['p_values'].append(p_val)
    res = pd.DataFrame()
    res['feature_1'] = collected['feature_1']
    res['feature_2'] = collected['feature_2']
    res['corr_values'] = collected['corr_values']
    res['p_values'] = collected['p_values']
    return res
# look at features with high correlations and low p values
# thresholds for correlation magnitude and p-value
corr_th = 0.5
p_th = 0.05
# patient IDs are labels, not measurements — exclude them from the correlation
df = summary_of_slopes_df.drop(columns='patient_IDs')
dfcoeff, dfpvals = corr_and_p_values(df)
res = extract_high_corr_low_p_value(dfcoeff, dfpvals, corr_th, p_th)
# bare expressions below only display output inside a notebook
dfcoeff.head()
dfpvals.head()
res
# have a closer look into features whose rate of change was found highly
# correlated to that of the clinical visits; plot device values and clinical
# measurements on the same graph
at_home_features_highly_correlated
# indices 6, 13, 14: turn_vel_max_at_home, zx_nondominant_median_at_home,
# zx_nondominant_num_correct_at_home
for idx in (6, 13, 14):
    feature = at_home_features_highly_correlated[idx]
    plot_feature_across_days_and_clinical_visitis(feature)
# for all 19 highly correlated at home features (structured activity):
# plot variation of a feature, regression lines, over the course of study,
# plus the regression line for clinical visits 2 and 3
# IMPROVEMENT: a single loop replaces 19 copy-pasted index/call pairs
for feature in at_home_features_highly_correlated:
    plot_feature_across_days_and_clinical_visitis(feature)